package lab;

import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlElement;
import com.gargoylesoftware.htmlunit.html.HtmlPage;

import java.io.IOException;
import java.util.List;

/**
 * Crawls a single listing page and prints the link targets found on it.
 *
 * <p>The page at {@code PAGE_URL} is fetched with a headless HtmlUnit
 * {@link WebClient}. Every {@code <div>} whose {@code class} attribute equals
 * {@code CONTAINER_CLASS} is located, and the {@code href} attribute of each
 * {@code <a>} element inside those divs is written to standard output, one per line.
 */
public class HomePage {
    /** Address of the page that is downloaded and scanned. */
    private static final String PAGE_URL = "https://so.gushiwen.cn/gushi/tangshi.aspx";

    /** Value of the {@code class} attribute identifying the divs that hold the links. */
    private static final String CONTAINER_CLASS = "typecont";

    /**
     * Program entry point: downloads the page and prints the {@code href} of every
     * anchor nested in a matching container div.
     *
     * @param args command-line arguments; not used
     * @throws IOException declared because {@code WebClient.getPage} may fail with an I/O error
     */
    public static void main(String[] args) throws IOException {
        // WebClient is a UI-less browser (an HTTP client); here it emulates Chrome.
        // try-with-resources closes it when the block exits.
        try (WebClient browser = new WebClient(BrowserVersion.CHROME)) {
            // Turn off the JavaScript and CSS engines before loading the page.
            browser.getOptions().setJavaScriptEnabled(false);
            browser.getOptions().setCssEnabled(false);

            HtmlPage listing = browser.getPage(PAGE_URL);

            // Collect every <div class="typecont"> under <body>.
            List<HtmlElement> containers =
                    listing.getBody().getElementsByAttribute("div", "class", CONTAINER_CLASS);

            for (HtmlElement container : containers) {
                printAnchorTargets(container);
            }
        }
    }

    /**
     * Prints the {@code href} attribute of each {@code <a>} element found under the
     * given element, one value per line on standard output.
     *
     * @param container element whose descendant anchors are listed
     */
    private static void printAnchorTargets(HtmlElement container) {
        List<HtmlElement> anchors = container.getElementsByTagName("a");
        for (HtmlElement anchor : anchors) {
            System.out.println(anchor.getAttribute("href"));
        }
    }
}
